import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
import pytz
import itertools
import visualize
import utils
import pydotplus
from sklearn import metrics
from sklearn import ensemble
import pvlib
import cs_detection
import visualize_plotly as visualize
# import visualize
# from bokeh.plotting import output_notebook
# output_notebook()
from IPython.display import Image
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)
%matplotlib notebook
Read pickled data from data exploration notebook.
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', compression='gzip')
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', compression='gzip')
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio')
nsrdb.df.keys()
feature_cols = ['GHI', 'Clearsky GHI',
'abs_diff_ratio', 'GHI mean', 'GHI std', 'GHI max', 'GHI min',
'Clearsky GHI mean', 'Clearsky GHI std',
'Clearsky GHI max', 'Clearsky GHI min',
'abs_diff_ratio mean', 'abs_diff_ratio std', 'abs_diff_ratio max',
'abs_diff_ratio min',
'GHI gradient',
'GHI gradient mean', 'GHI gradient std', 'GHI gradient max',
'GHI gradient min',
'Clearsky GHI gradient',
'Clearsky GHI gradient mean', 'Clearsky GHI gradient std',
'Clearsky GHI gradient max', 'Clearsky GHI gradient min',
'abs_diff_ratio gradient',
'abs_diff_ratio gradient mean', 'abs_diff_ratio gradient std',
'abs_diff_ratio gradient max', 'abs_diff_ratio gradient min',
'GHI line length', 'Clearsky GHI line length', 'abs_ratio_diff line length']
target_cols = ['sky_status']
# clf = tree.DecisionTreeClassifier(class_weight='balanced', max_leaf_nodes=len(feature_cols))
clf = ensemble.RandomForestClassifier(class_weight='balanced', max_leaf_nodes=24, n_estimators=100)
clf = nsrdb.fit_model(feature_cols, target_cols, clf)
Training vs the clearsky model in NSRDB is quite accurate. I don't really want to use this clearsky curve though since it's unavailable for ground based measurements.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
clf.fit(train.df[feature_cols], train.df[target_cols])
pred = clf.predict(test.df[feature_cols]).flatten()
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'Both')
vis.show()
nsrdb.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
nsrdb.df.keys()
feature_cols = ['GHI', 'Clearsky GHI pvlib',
'abs_diff_ratio', 'GHI mean', 'GHI std', 'GHI max', 'GHI min',
'Clearsky GHI pvlib mean', 'Clearsky GHI pvlib std',
'Clearsky GHI pvlib max', 'Clearsky GHI pvlib min',
'abs_diff_ratio mean', 'abs_diff_ratio std', 'abs_diff_ratio max',
'abs_diff_ratio min',
'GHI gradient',
'GHI gradient mean', 'GHI gradient std', 'GHI gradient max',
'GHI gradient min',
'Clearsky GHI pvlib gradient',
'Clearsky GHI pvlib gradient mean', 'Clearsky GHI pvlib gradient std',
'Clearsky GHI pvlib gradient max', 'Clearsky GHI pvlib gradient min',
'abs_diff_ratio gradient',
'abs_diff_ratio gradient mean', 'abs_diff_ratio gradient std',
'abs_diff_ratio gradient max', 'abs_diff_ratio gradient min',
'GHI line length', 'Clearsky GHI pvlib line length', 'abs_ratio_diff line length']
target_cols = ['sky_status']
clf = ensemble.RandomForestClassifier(class_weight='balanced', max_leaf_nodes=24, n_estimators=100)
clf = nsrdb.fit_model(feature_cols, target_cols, clf)
Using the PVLib clearsky yields similar results to the NSRDB clearsky curve. The advantage of the PVLib curve is we can compute it ground based measurements if we have the location.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
clf.fit(train.df[feature_cols], train.df[target_cols])
pred = clf.predict(test.df[feature_cols]).flatten()
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'Both')
vis.show()
nsrdb.scale_model('GHI', 'Clearsky GHI stat smooth', 'sky_status')
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI stat smooth',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
nsrdb.df.keys()
feature_cols = ['GHI', 'Clearsky GHI stat smooth',
'abs_diff_ratio', 'GHI mean', 'GHI std', 'GHI max', 'GHI min',
'Clearsky GHI stat smooth mean', 'Clearsky GHI stat smooth std',
'Clearsky GHI stat smooth max', 'Clearsky GHI stat smooth min',
'abs_diff_ratio mean', 'abs_diff_ratio std', 'abs_diff_ratio max',
'abs_diff_ratio min',
'GHI gradient',
'GHI gradient mean', 'GHI gradient std', 'GHI gradient max',
'GHI gradient min',
'Clearsky GHI stat smooth gradient',
'Clearsky GHI stat smooth gradient mean', 'Clearsky GHI stat smooth gradient std',
'Clearsky GHI stat smooth gradient max', 'Clearsky GHI stat smooth gradient min',
'abs_diff_ratio gradient',
'abs_diff_ratio gradient mean', 'abs_diff_ratio gradient std',
'abs_diff_ratio gradient max', 'abs_diff_ratio gradient min',
'GHI line length', 'Clearsky GHI stat smooth line length', 'abs_ratio_diff line length']
target_cols = ['sky_status']
clf = ensemble.RandomForestClassifier(class_weight='balanced', max_leaf_nodes=24, n_estimators=100)
clf = nsrdb.fit_model(feature_cols, target_cols, clf)
Using the smoothed statistical clearsky curve gives results that are slightly less accurate than the NSRDB and PVLib curves. This is also a candidate to be used in future work since it can be computed for ground based measruements (even without location information).
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
clf.fit(train.df[feature_cols], train.df[target_cols])
pred = clf.predict(test.df[feature_cols]).flatten()
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI stat smooth'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'Both')
vis.show()
All three methods perform admirably. It looks like 'obvious' errors are few and far between. The big interest now is which method will perform best when NSRDB data is trained and tested on ground data.
Only making ground predictions using PVLib clearsky model and statistical model. NSRDB model won't be available to ground measurements.
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
We will reduce the frequency of ground based measurements to match NSRDB.
ground.intersection(nsrdb.df.index)
nsrdb.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
ground.trim_dates('10-01-2015', None)
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
ground.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
feature_cols = ['GHI', 'Clearsky GHI pvlib',
'abs_diff_ratio', 'GHI mean', 'GHI std', 'GHI max', 'GHI min',
'Clearsky GHI pvlib mean', 'Clearsky GHI pvlib std',
'Clearsky GHI pvlib max', 'Clearsky GHI pvlib min',
'abs_diff_ratio mean', 'abs_diff_ratio std', 'abs_diff_ratio max',
'abs_diff_ratio min',
'GHI gradient',
'GHI gradient mean', 'GHI gradient std', 'GHI gradient max',
'GHI gradient min',
'Clearsky GHI pvlib gradient',
'Clearsky GHI pvlib gradient mean', 'Clearsky GHI pvlib gradient std',
'Clearsky GHI pvlib gradient max', 'Clearsky GHI pvlib gradient min',
'abs_diff_ratio gradient',
'abs_diff_ratio gradient mean', 'abs_diff_ratio gradient std',
'abs_diff_ratio gradient max', 'abs_diff_ratio gradient min',
'GHI line length', 'Clearsky GHI pvlib line length',
'abs_ratio_diff line length']
target_cols = ['sky_status']
train = cs_detection.ClearskyDetection(nsrdb.df)
test = cs_detection.ClearskyDetection(ground.df)
clf = ensemble.RandomForestClassifier(class_weight='balanced', max_leaf_nodes=24, n_estimators=100)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3)
pred = pred.astype(bool)
train.intersection(test.df.index)
metrics.accuracy_score(train.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.trim_dates('10-01-2015', '11-01-2015')
test = ground
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 10, smooth=True)
pred = pred.astype(bool)
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')
vis.show()
Only making ground predictions using PVLib clearsky model and statistical model. NSRDB model won't be available to ground measurements.
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
We will reduce the frequency of ground based measurements to match NSRDB.
nsrdb.scale_model('GHI', 'Clearsky GHI stat smooth', 'sky_status')
ground.intersection(nsrdb.df.index)
ground.trim_dates('10-01-2015', None)
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI stat smooth',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
ground.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI stat smooth',
ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
feature_cols = ['GHI', 'Clearsky GHI stat smooth',
'abs_diff_ratio', 'GHI mean', 'GHI std', 'GHI max', 'GHI min',
'Clearsky GHI stat smooth mean', 'Clearsky GHI stat smooth std',
'Clearsky GHI stat smooth max', 'Clearsky GHI stat smooth min',
'abs_diff_ratio mean', 'abs_diff_ratio std', 'abs_diff_ratio max',
'abs_diff_ratio min',
'GHI gradient',
'GHI gradient mean', 'GHI gradient std', 'GHI gradient max',
'GHI gradient min',
'Clearsky GHI stat smooth gradient',
'Clearsky GHI stat smooth gradient mean', 'Clearsky GHI stat smooth gradient std',
'Clearsky GHI stat smooth gradient max', 'Clearsky GHI stat smooth gradient min',
'abs_diff_ratio gradient',
'abs_diff_ratio gradient mean', 'abs_diff_ratio gradient std',
'abs_diff_ratio gradient max', 'abs_diff_ratio gradient min',
'GHI line length', 'Clearsky GHI stat smooth line length',
'abs_ratio_diff line length']
target_cols = ['sky_status']
train = cs_detection.ClearskyDetection(nsrdb.df)
test = cs_detection.ClearskyDetection(ground.df)
clf = ensemble.RandomForestClassifier(class_weight='balanced', max_leaf_nodes=24, n_estimators=100)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict(feature_cols, 'GHI', 'Clearsky GHI stat smooth', clf, 3)
pred=pred.astype(bool)
train.intersection(test.df.index)
metrics.accuracy_score(train.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI stat smooth'], 'GHI_cs')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
# ground.calc_all_window_metrics(1, 10, col1='GHI', col2='Clearsky GHI stat smooth',
# ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)
ground.trim_dates('10-01-2015', '11-01-2015')
test = ground
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict(feature_cols, 'GHI', 'Clearsky GHI stat smooth', clf, 10, smooth=True)
pred = pred.astype(bool)
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI stat smooth'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')
vis.show()